Simulating data

# simu parameters
set.seed(7)
n=200
p=14
r=1
type="scale-free"
plot=TRUE

################
#----- DATA
# simulate graph and omega, then sigma0 and finally counts
data=data_from_scratch(type = type,p = p+r,n = n,signed = FALSE,prob = 5/p,v = 0)
## Generating data from the multivariate normal distribution with the scale-free graph structure....done.
omega=data$omega
hidden=which(diag(omega)%in%sort(diag(omega), decreasing = TRUE)[1:r])[1:r] # on cache les r plus gros
trueClique=which(omega[hidden,-hidden]!=0)
if(plot){
  G=draw_network(1*(omega==1),groupes=1*(diag(omega)==diag(omega)[hidden][1]), 
                 layout="nicely",curv=0,nb=2,pal="black",nodes_label =rep("",p+r))$G
  print(G)
}

Kh  <- omega[hidden,hidden]
Ko  <- omega[-hidden,-hidden]
Koh <- omega[-hidden,hidden]
Km  <- Ko - Koh %*%solve(Kh)%*% t(Koh)
sigmaO=solve(Km)
counts=generator_PLN(sigmaO,covariates = NULL,n=n)

ome_init=omega[c(2:15,1),c(2:15,1)] #ome is for testing
ome=ome_init
diag(ome)=0

Initialization

\(Z_O\) distribution

#----- PLN on counts
# optimization of theta and h(Z_O)

PLNfit<-PLN(counts~1)
## 
##  Initialization...
##  Adjusting a PLN model with full covariance model
##  Post-treatments...
##  DONE!
MO<-PLNfit$var_par$M # MiO = ith row of MO
SO<-PLNfit$var_par$S # SiO = diag(ith row of SO)
sigma_obs=PLNfit$model_par$Sigma

Study of the prefered quantity of noise in this example:

B=20 ; coeff=seq(0.1,4,0.5)
varyCoeff=lapply(coeff,function(c){
  res=cbind( data.frame( t(sapply(1:B, function(x){
    init.mclust(sigma_obs,title="Sigma",
                trueClique = trueClique,n.noise=round(p*c), plot=FALSE)$FPN
  }))),c)
})
varyCoeff=do.call(rbind,varyCoeff)
colnames(varyCoeff)=c("FN","FP","coeff")

varyCoeff %>% mutate(nul = (FN==1 & FP==0)) %>% filter(!nul) %>% dplyr::select(-nul) %>%
  group_by(coeff) %>% summarise(sumFP=sum(FP), sumFN=sum(FN), sum=sum(FP+FN)) %>% 
  kable() %>%
  kable_styling()
coeff sumFP sumFN sum
0.1 2.857143 2.857143 5.714286
0.6 2.571429 3.857143 6.428571
1.1 2.142857 5.142857 7.285714
1.6 1.142857 5.142857 6.285714
2.1 1.714286 6.571429 8.285714
2.6 2.000000 6.285714 8.285714
3.1 2.142857 6.428571 8.571429
3.6 1.000000 7.142857 8.142857
initviasigma=init.mclust((cov2cor(sigma_obs)),title="Sigma",
                         trueClique = trueClique,n.noise=1)$init

initial.param<-initEM(sigma_obs,n=n,cliqueList = list(initviasigma),cst=1.05, pca=TRUE) # quick and dirty modif for initEM to take a covariance matrix as input
omega_init=initial.param$K0
sigma_init=initial.param$Sigma0

pr=prcomp(t(counts[,initviasigma]),scale. = FALSE)
MHinit = matrix(pr$rotation[,1]*pr$sdev[1],nrow=n,ncol=r)

Tree distribution

O=1:p
Wg_init <- matrix(1, p+r, p+r); diag(Wg_init) = 0; Wg_init =Wg_init / sum(Wg_init)
W_init <- matrix(1, p+r, p+r); diag(W_init) = 0; W_init =W_init / sum(W_init)
W_init[O,O] <- EMtree_corZ(cov2cor(sigma_obs),n = n,maxIter = 20)$edges_weight
## 
## Likelihoods: 70.3395 , 70.38846 , 70.38846 , 
## Convergence took 0.36 secs  and  3  iterations.
## Likelihood difference = 1.724393e-08 
## Betas difference = 2.524023e-16

Results

alpha=1

resVEM<-VEMtree(counts,MO,SO,MH=MHinit,omega_init,W_init,Wg_init, eps=5e-3, 
                alpha=1,
                maxIter=6, plot=TRUE,vraiOm=ome_init, condTrack=TRUE)
## 
##  Iter n°1

## , diffinvSigO=3.3799
##  Iter n°2

## , diffinvSigO=1.82113
##  Iter n°3

## , diffinvSigO=0.9517
##  Iter n°4

## , diffinvSigO=0.25152
##  Iter n°5

## , diffinvSigO=0.26242
##  Iter n°6

## , diffinvSigO=0.13304
## VEMtree ran in 3.771secs and 6 iterations.
## Final weights difference: 0.0069

Edges probability

obs=data.frame(value=F_Sym2Vec(resVEM$Pg[1:p,1:p]), key="obs")
hid= data.frame(value=resVEM$Pg[1:p,p+1], key="hid")
rbind(obs, hid)%>%  
  ggplot(aes(value, fill=key, color=key))+geom_histogram()+theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plotVEM(resVEM$Pg,ome,r=1,seuil=0.5)

Precision-Recall curves

values=courbes_seuil(probs = resVEM$Pg,omega = ome,h = 15,seq_seuil = seq(0,1,0.02))
values %>% mutate(crit = PPV+TPR) %>% filter(crit==max(crit, na.rm=TRUE)) %>%
  summarise(mins=min(seuil), maxs=max(seuil), PPV=max(PPV), PPVO=max(PPVO),PPVH=max(PPVH), 
            TPR=max(TPR),TPRO=max(TPRO),TPRH=max(TPRH)) %>% 
  kable() %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "float_right")
mins maxs PPV PPVO PPVH TPR TPRO TPRH
0.16 0.64 0.87 0.88 0.86 0.93 1 0.86

Ce tableau repère pour quelles plage de seuil on a une valeure maximale pour la quantité PPV+TPR, et affiche les valeurs correspondantes pour PPVO, PPVH, TPRO et TPRH.

plotVerdict(values, seuil)+guides(color=FALSE)

Omega estimation

alpha=0.9

resVEM<-VEMtree(counts,MO,SO,MH=MHinit,omega_init,W_init,Wg_init, eps=5e-3, 
                alpha=0.9,
                maxIter=6, plot=TRUE,vraiOm=ome_init, condTrack=TRUE)
## 
##  Iter n°1

## , diffinvSigO=3.3799
##  Iter n°2

## , diffinvSigO=1.80801
##  Iter n°3

## , diffinvSigO=0.95938
##  Iter n°4

## , diffinvSigO=0.25699
##  Iter n°5

## , diffinvSigO=0.04308
## VEMtree ran in 2.455secs and 5 iterations.
## Final weights difference: 9e-04

Edges probability

obs=data.frame(value=F_Sym2Vec(resVEM$Pg[1:p,1:p]), key="obs")
hid= data.frame(value=resVEM$Pg[1:p,p+1], key="hid")
rbind(obs, hid)%>%  
  ggplot(aes(value, fill=key, color=key))+geom_histogram()+theme_light()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plotVEM(resVEM$Pg,ome,r=1,seuil=0.5)

Precision-Recall curves

values=courbes_seuil(probs = resVEM$Pg,omega = ome,h = 15,seq_seuil = seq(0,1,0.02))
values %>% mutate(crit = PPV+TPR) %>% filter(crit==max(crit, na.rm=TRUE)) %>%
  summarise(mins=min(seuil), maxs=max(seuil), PPV=max(PPV), PPVO=max(PPVO),PPVH=max(PPVH), 
            TPR=max(TPR),TPRO=max(TPRO),TPRH=max(TPRH)) %>% 
  kable() %>% kable_styling()
mins maxs PPV PPVO PPVH TPR TPRO TPRH
0.14 0.66 0.87 0.88 0.86 0.93 1 0.86
plotVerdict(values, seuil)+guides(color=FALSE)

Omega estimation